library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(modelr)
library(viridis)
## Loading required package: viridisLite
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.2.3
library(latex2exp)
# Load the combined analysis table; readr guesses the column types.
df <- read_csv("../../analysis_data/all_data.csv")
## Rows: 81428 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): librarian, vectorizer, center, fields_of_study_0
## dbl (3): density, edginess, citations_per_year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the loaded table.
head(df, n = 6L)
## # A tibble: 6 × 7
## density edginess citations_per_year librarian vectorizer center field…¹
## <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 623. 0.721 8.86 S2 GPT2 hafenLowreds… Physics
## 2 1784. 0.528 59.7 S2 GPT2 hafenLowreds… Physics
## 3 1768. 0.591 20.4 S2 GPT2 hafenLowreds… Physics
## 4 1409. 0.487 1 S2 GPT2 hafenLowreds… Physics
## 5 1858. 0.552 14.2 S2 GPT2 hafenLowreds… Physics
## 6 1486. 0.435 11.3 S2 GPT2 hafenLowreds… Physics
## # … with abbreviated variable name ¹​fields_of_study_0
# TEMP: Filter first to frequent values; this is a dissatisfying band-aid for BOW.
# df <- df %>% group_by(density) %>% filter(n() >= 50)
# In fact, it prevents viewing the other interesting ones. We might simply be unable to facet automatically unless the data is already heavily transformed in a specific way for the BOW/Word2Vec vectorizers vs. the neural LM vectorizers.
# Z-score density and citations per year separately within every
# (vectorizer, center) combination.
df_grouped_z <- df %>%
  group_by(vectorizer, center) %>%
  mutate(
    # scale() returns a one-column matrix carrying centering/scaling
    # attributes; as.numeric() reduces it to a plain numeric vector so the
    # new columns behave like ordinary numeric columns downstream.
    density_z = as.numeric(scale(density)),
    cpy_z = as.numeric(scale(citations_per_year))
  ) %>%
  # Drop the grouping once the per-group statistics are computed so later
  # verbs do not silently operate per group.
  ungroup()
# Trim outliers using the z-scores:
#   * density: keep values within two standard deviations of the mean;
#   * citations per year: keep values from two standard deviations below the
#     mean up to the mean itself (z = 0).
# The upper citation bound of 0 may be the only way to see everything, but it
# may also exclude things; it might need to move to a config, with the plots
# tweaked accordingly.
# Alternative density cut, currently unused:
#   density <= median(df_grouped_z$density)
df_grouped_zf <- df_grouped_z %>%
  filter(
    abs(density_z) <= 2,
    cpy_z >= -2,
    cpy_z <= 0
  )
# Faceted overview: filled 2D density contours of citations per year against
# z-scaled density, overlaid with a LOESS trend line and the raw points.
# Rows were filtered by z-score; the original (unscaled) columns remain
# available, so `x = density` or `y = cpy_z` can be swapped in here.
ggplot(
  df_grouped_zf,
  mapping = aes(x = density_z, y = citations_per_year)
) +
  # "ndensity" makes the contour levels use the density estimate scaled to a
  # maximum of 1.
  geom_density_2d_filled(contour_var = "ndensity") +
  scale_fill_viridis(option = "viridis", discrete = TRUE) +
  # The x axis shows the z-scaled column, so the label says so.
  xlab("Density z-scaled") +
  ylab("Citations per year") +
  # `linewidth` replaces `size` for lines as of ggplot2 3.4.0.
  geom_smooth(color = "orange", linewidth = 2, method = "loess", span = .3) +
  geom_point(alpha = 0.05, color = "white", size = 1) +
  theme(axis.title = element_text(size = 18)) +
  # IMPORTANT: facet by the same variables used for the group-wise z-scoring.
  # This should make the distribution no longer bimodal.
  facet_grid(vectorizer ~ center)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## `geom_smooth()` using formula = 'y ~ x'

# Single-group view: SciBERT vectorizer, hafenLowredshiftLymanLimit2017 center.
df_test <- df_grouped_zf %>%
  filter(
    vectorizer == "SciBERT",
    center == "hafenLowredshiftLymanLimit2017"
  )

# Rows were selected by z-score upstream, but the original density values are
# plotted here (swap in `x = density_z` / `y = cpy_z` for the scaled view).
ggplot(df_test, mapping = aes(x = density, y = citations_per_year)) +
  geom_density_2d_filled(contour_var = "ndensity") +
  scale_fill_viridis(option = "viridis", discrete = TRUE) +
  xlab("Density") +
  ylab("Citations per year") +
  # `linewidth` replaces `size` for lines as of ggplot2 3.4.0.
  geom_smooth(color = "orange", linewidth = 2, method = "loess", span = .3) +
  geom_point(alpha = 0.05, color = "white", size = 1) +
  theme(axis.title = element_text(size = 18))
## `geom_smooth()` using formula = 'y ~ x'

# Single-group view: BOW vectorizer, hafenLowredshiftLymanLimit2017 center.
df_test <- df_grouped_zf %>%
  filter(
    vectorizer == "BOW",
    center == "hafenLowredshiftLymanLimit2017"
  )

# Rows were selected by z-score upstream, but the original density values are
# plotted here (swap in `x = density_z` / `y = cpy_z` for the scaled view).
ggplot(df_test, mapping = aes(x = density, y = citations_per_year)) +
  geom_density_2d_filled(contour_var = "ndensity") +
  scale_fill_viridis(option = "viridis", discrete = TRUE) +
  xlab("Density") +
  ylab("Citations per year") +
  # `linewidth` replaces `size` for lines as of ggplot2 3.4.0.
  geom_smooth(color = "orange", linewidth = 2, method = "loess", span = .3) +
  geom_point(alpha = 0.05, color = "white", size = 1) +
  theme(axis.title = element_text(size = 18))
## `geom_smooth()` using formula = 'y ~ x'

# Single-group view: Word2Vec vectorizer, hafenLowredshiftLymanLimit2017 center.
df_test <- df_grouped_zf %>%
  filter(
    vectorizer == "Word2Vec",
    center == "hafenLowredshiftLymanLimit2017"
  )

# Rows were selected by z-score upstream, but the original density values are
# plotted here (swap in `x = density_z` / `y = cpy_z` for the scaled view).
ggplot(df_test, mapping = aes(x = density, y = citations_per_year)) +
  geom_density_2d_filled(contour_var = "ndensity") +
  scale_fill_viridis(option = "viridis", discrete = TRUE) +
  xlab("Density") +
  ylab("Citations per year") +
  # `linewidth` replaces `size` for lines as of ggplot2 3.4.0.
  geom_smooth(color = "orange", linewidth = 2, method = "loess", span = .3) +
  geom_point(alpha = 0.05, color = "white", size = 1) +
  theme(axis.title = element_text(size = 18))
## `geom_smooth()` using formula = 'y ~ x'
